home *** CD-ROM | disk | FTP | other *** search
/ Shareware Grab Bag / Shareware Grab Bag.iso / 050 / bix02.arc / INVERT.PAS < prev    next >
Pascal/Delphi Source File  |  1986-08-04  |  16KB  |  381 lines

  1. (* INVERT.PAS   version 0.10   3/23/86  *)
  2. (* Adds support for file size up to 4 megabytes *)
  3.  
  4.  
  5. (*******************************************************************)
  6. (*                                                                 *)
  7. (*                          INVERT                                 *)
  8. (*                                                                 *)
  9. (*              Copyright 1986 by Mark J. Welch                    *)
  10. (*                P.O. Box 2409, SF, CA 94126                      *)
  11. (*                                                                 *)
  12. (*              Portions copyright 1986 by JimKeo                  *)
  13. (*                                                                 *)
  14. (*                Last revised 3/22/86 by MJW                      *)
  15. (*                                                                 *)
  16. (*   1) Read text files and "invert" them into a sorted list of    *)
  17. (*      words and their locations within the file.                 *)
  18. (*   2) Store inversion to disk file                               *)
  19. (*   3) Later, modify program to allow unlimited expandability     *)
  20. (*      of inversion file and easy search/access to keywords       *)
  21. (*                                                                 *)
  22. (*   Words are stored in a binary tree, and location pointers      *)
  23. (*   are stored in a linked list rooted at each tree entry.        *)
  24. (*                                                                 *)
  25. (*******************************************************************)
  26.  
  27. (**********************************************************************)
  28. (*                                                                    *)
  29. (*                          WARNING                                   *)
  30. (*                                                                    *)
  31. (* WARNING: To save on execution time, the program does not now check *)
  32. (* available heap space before calling New(). As a result, it will    *)
  33. (* crash if it runs out of memory, usually after about an hour of     *)
  34. (* execution; if it crashes, all work is lost. It also crashes if too *)
  35. (* many files are inverted at one time. This error-handling should be *)
  36. (* improved in any "release" version. -MJW                            *)
  37. (*                                                                    *)
  38. (**********************************************************************)
  39.  
  40. Program Invert;
  41. (* Invert a text file for quick searching, etc. *)
  42.  
  43. CONST
  44.  
  45.   FileNameLength = 20;   (* 14 is probably sufficient, but... *)
  46.   WordLength = 14;       (* truncate words at 14 characters   *)
  47.   MaxFiles = 256;        (* maximum number of files to invert *)
  48.                          (* -- needed for FileList array, but *)
  49.                          (* could be replaced with a linked   *)
  50.                          (* list later.                       *)
  51. TYPE
  52.  
  53.   S = String[WordLength];
  54.   FileName = String[FileNameLength];
  55.   FileNum  = Integer;
  56.   FileLists = Array[1..MaxFiles] of FileName;
  57.  
  58.   LocationPtr = ^LocationType;
  59.   LocationType = RECORD
  60.                    F: FileNum;          (* index into FileList *)
  61.                    FilBlockPtr: Integer;(* 128-byte block offset *)
  62.                    FilPosPtr: Byte;     (* byte offset in file *)
  63.                    Next: LocationPtr;   (* next location in linked list *)
  64.                  END;
  65.  
  66.   WordPtr     = ^WordType;
  67.   WordType    = RECORD
  68.                   Location: LocationPtr; (* pointer into file *)
  69.                   Parent: WordPtr;       (* in b-tree *)
  70.                   SmallerChild: WordPtr; (* less-than *)
  71.                   GreaterChild: WordPtr; (* greater-than *)
  72.                   Text: S;               (* the word *)
  73.                 END;
  74.  
  75. VAR
  76.  
  77.   FileList: FileLists;  (* list of files that are being inverted *)
  78.   NumFiles: Integer;    (* number of files inverted so far *)
  79.   WordRoot: WordPtr;    (* root of the b-tree *)
  80.   i: integer;           (* index for FOR loop *)
  81.  
  82.  
  83.  
  84.  
  85.  
  86.  
  87. (****************************************************************)
  88. (*                                                              *)
  89. (*                      PrintResults                            *)
  90. (*                                                              *)
  91. (*      Print the entire tree in alphabetical order to a file   *)
  92. (*                                                              *)
  93. (****************************************************************)
  94.  
  95.  
  96. Procedure PrintResults;
  97.   var outfil: text;
  98.  
  99.       i: integer;
  100.  
  101.     (***********************************************)
  102.     (*                                             *)
  103.     (*               PrintTree                     *)
  104.     (*                                             *)
  105.     (*  RECURSIVE: print entire branch of tree by  *)
  106.     (*  first printing entire Smaller branch, then *)
  107.     (*  current node text, and then entire Larger  *)
  108.     (*                                             *)
  109.     (***********************************************)
  110.     Procedure PrintTree(X: WordPtr);
  111.       Var L: LocationPtr;
  112.           i: integer;
  113.           r: real;
  114.     Begin
  115.       If X^.SmallerChild <> Nil         (* leftmost children first *)
  116.         then PrintTree(X^.SmallerChild);
  117.       L := X^.Location;
  118.       If L <> Nil                       (* Then current node *)
  119.         then
  120.           begin
  121.             for i := 2 to ord(x^.text[0]) do
  122.               if ((x^.text[i] >= 'A') and (x^.text[i] <= 'Z'))
  123.                 then x^.text[i] := chr(ord(x^.text[i])+32);
  124.             Write(outfil,X^.Text,': ');
  125.             Repeat
  126.               r := (L^.FilBlockPtr * 128.0) +L^.FilPosPtr;
  127.               Write(outfil,L^.F:1,'-',r:1:0,' ');
  128.               L := L^.Next;
  129.             Until L = Nil;
  130.             Writeln(outfil);
  131.           end;
  132.        If X^.GreaterChild <> Nil        (* and then right children *)
  133.         then PrintTree(X^.GreaterChild);
  134.     End; (*PrintTree *)
  135.  
  136. begin (* PrintResults *)
  137.   Assign(outfil,'INVERT.INX');
  138.   Rewrite(outfil);
  139.   for i := 1 to NumFiles do
  140.     Writeln(outfil, i:1,'   ',FileList[i]);
  141.   writeln(outfil);
  142.   PrintTree(WordRoot);
  143.   writeln(outfil);
  144.   close(outfil);
  145. end; (* printResults *)
  146.  
  147.  
  148.  
  149. (*******************************************************************)
  150. (*                                                                 *)
  151. (*                        Title                                    *)
  152. (*                                                                 *)
  153. (*******************************************************************)
  154.  
  155. Procedure Title;
  156. Begin
  157.   Writeln;
  158.   Writeln('         Invert  --  Version 0.10  --  3/23/86');
  159.   Writeln;
  160.   Writeln('Copyright 1986 by Mark J. Welch, Box 2409, SF CA 94126');
  161.   Writeln('       (Portions Copyright 1986 by Jim Keohane)');
  162.   Writeln;
  163.   Writeln;
  164. End;
  165.  
  166.  
  167. (*******************************************************************)
  168. (*                                                                 *)
  169. (*                      MemoryAvail                                *)
  170. (*                                                                 *)
  171. (*******************************************************************)
  172. Function MemoryAvail: Real;
  173.   Var M: Real;
  174. begin
  175.   M := Memavail;
  176.   If M < 0 then M := 65536.0 + M;
  177.   MemoryAvail := M * 16;
  178. end;
  179.  
  180.  
  181. (*******************************************************************)
  182. (*                                                                 *)
  183. (*                      CreateRoot                                 *)
  184. (*                                                                 *)
  185. (*      The binary tree has to start somewhere: start it here.     *)
  186. (*                                                                 *)
  187. (*******************************************************************)
  188. Procedure CreateRoot;
  189. Begin
  190.   New(WordRoot);
  191.   WordRoot^.GreaterChild := Nil;
  192.   WordRoot^.SmallerChild := Nil;
  193.   WordRoot^.Text := 'Mzzzzzzzzz'; (* let's split the alphabet here   *)
  194.                                   (* to improve initial tree balance *)
  195.   WordRoot^.Location := Nil;
  196.   WordRoot^.Parent := Nil; (* no parent for root *)
  197. End; (* CreateRoot *)
  198.  
  199.  
  200.  
  201. (****************************************************************)
  202. (*                                                              *)
  203. (*                      InvertFil                               *)
  204. (*                                                              *)
  205. (*      Given a file, add all its words and their locations     *)
  206. (*      to the inversion tree.                                  *)
  207. (*                                                              *)
  208. (****************************************************************)
  209.  
  210. Procedure InvertFile(FN: FileName);
  211. var block:array[0..127] of char;
  212.     j,k:integer;
  213.     Fil: file;
  214.     St: S;
  215.     StPtr,CurrentLoc: LocationPtr;
  216.     c: char;
  217.     i: integer;
  218.     matchkey: boolean;
  219.  
  220.  
  221.       (****************************************************************)
  222.       (*                                                              *)
  223.       (*                        AddWord                               *)
  224.       (*                                                              *)
  225.       (*      Given a word and a prepared location link/pointer,      *)
  226.       (*      add it into the inversion structure, either as a new    *)
  227.       (*      word or onto an existing linked-list.                   *)
  228.       (*                                                              *)
  229.       (****************************************************************)
  230.       Procedure AddWord(St: S; StLoc: LocationPtr);
  231.       Var CurrentWord: WordPtr;
  232.           Match: Boolean;
  233.  
  234.  
  235.              (**********************************************)
  236.              (*                                            *)
  237.              (*               NewChild                     *)
  238.              (*                                            *)
  239.              (*     Add St as a new word in tree           *)
  240.              (*                                            *)
  241.              (**********************************************)
  242.              Procedure NewChild(var X: WordPtr; var St: s);
  243.                 (* match is imported as a "global" variable *)
  244.              begin
  245.                Match := true;
  246.                GetMem(X,sizeof(X^)-WordLength+Length(St));
  247.                X^.text := St;
  248.                X^.Parent := CurrentWord;
  249.                X^.SmallerChild := Nil;
  250.                X^.GreaterChild := Nil;
  251.                X^.Location := StLoc;
  252.              end; (* NewChild *)
  253.  
  254.  
  255.       Begin (* AddWord *)
  256.         (* First search if it exists *)
  257.  
  258.         CurrentWord := WordRoot; (* start at root of tree *)
  259.         Match := False;          (* haven't found the right place yet *)
  260.         Repeat
  261.           If (CurrentWord^.Text[1] = St[1])
  262.             then
  263.               if (CurrentWord^.Text = St)
  264.                 then begin
  265.                        match := true; (* global *)
  266.                        CurrentLoc := CurrentWord^.Location;
  267.                        While CurrentLoc^.Next <> Nil Do
  268.                          CurrentLoc := CurrentLoc^.Next;
  269.                        CurrentLoc^.Next := StLoc;
  270.                      end
  271.                 else if (CurrentWord^.Text > St)
  272.                   then if CurrentWord^.SmallerChild <> Nil
  273.                          then CurrentWord := CurrentWord^.SmallerChild
  274.                          else NewChild(CurrentWord^.SmallerChild,st)
  275.                   else if CurrentWord^.GreaterChild <> Nil
  276.                          then CurrentWord := CurrentWord^.GreaterChild
  277.                          else NewChild(CurrentWord^.GreaterChild,st)
  278.             else
  279.               if (CurrentWord^.Text[1] > St[1])
  280.                 then if CurrentWord^.SmallerChild <> Nil
  281.                        then CurrentWord := CurrentWord^.SmallerChild
  282.                        else NewChild(CurrentWord^.SmallerChild,st)
  283.                 else if CurrentWord^.GreaterChild <> Nil
  284.                        then CurrentWord := CurrentWord^.GreaterChild
  285.                        else NewChild(CurrentWord^.GreaterChild,st);
  286.         Until Match;
  287.       End; (* AddWord *)
  288.  
  289.  
  290.  
  291.  
  292.  
  293. Begin   (* InvertFil *)
  294. (* JimKeo-modified code included in this procedure *)
  295.  
  296.   Writeln('Invert: ',FN); (* let user know what file we're fiddling with *)
  297.   NumFiles := NumFiles + 1;
  298.   If NumFiles > MaxFiles (* remove this when FileList is a linked list *)
  299.     then
  300.       begin (* crash impolitely, trashing all work done so far *)
  301.         writeln('Too many files inverted: maximum is ',MaxFiles);
  302.         Halt;
  303.       end
  304.     else FileList[NumFiles] := fn;
  305.   Assign(fil,fn);
  306.   Reset(fil);
  307.   St := '';
  308.   k := 0;                                     {block no}
  309.   blockread(fil,block,1,j);                 {read first block}
  310.   while (j=1) do
  311.     begin
  312.       for i := 0 to 127 do                      {128 bytes per block}
  313.        begin
  314.         c:=block[i];
  315.         If ((c >= 'A') and (c < 'z'))
  316.            and ((c >= 'a') or (c <= 'Z'))
  317.           then st := st + UpCase(c)
  318.           else
  319.             if (Ord(St[0]) > 0)
  320.               then
  321.                 begin
  322.                   matchKey := false;
  323.                   case st[1] of
  324.                     'A': matchKey := (st = 'A') or (st = 'AND') or (st = 'AN');
  325.                     'I': matchKey := (st = 'IN') or (st = 'IS') or
  326.                                      (st = 'IT') or (st = 'ITS');
  327.                     'N': matchKey := (st = 'NOT');
  328.                     'O': matchKey := (st = 'OR') or (st = 'ON') or (st = 'OF');
  329.                     'T': matchKey := (st = 'THE') or (st = 'TO');
  330.                     'Y': matchKey := (st = 'YOU');
  331.                   end; (* case *)
  332.                   if matchKey
  333.                     then St := ''
  334.                     else
  335.                       begin
  336.                         New(StPtr);
  337.                         StPtr^.F := NumFiles;
  338.                         StPtr^.FilBlockPtr := k;
  339.                         StPtr^.FilPosPtr :=i-ord(st[0]); {FilePos out}
  340.                         StPtr^.Next := Nil;
  341.                         AddWord(St,StPtr);
  342.                         St := '';
  343.                       end;
  344.                 end;
  345.        end;
  346.       k:=k+1;  {add in 128 bytes per block}
  347.       blockread(fil,block,1,j)    {read next and j=1 if more}
  348.     end;
  349.   Writeln('  Done Inverting ',FN);
  350. End; (* InvertFile *)
  351.  
  352.  
  353.  
  354. (************************************************************************)
  355. (*                                                                      *)
  356. (*                       Main Program Body                              *)
  357. (*                                                                      *)
  358. (************************************************************************)
  359. Begin (* Main Program *)
  360.   Title;
  361.   NumFiles := 0;
  362.   CreateRoot;
  363.   Writeln('Bytes of Available Memory at start: ',MemoryAvail:6:0);
  364.   For i := 1 to ParamCount do
  365.     begin
  366.       InvertFile(ParamStr(i));
  367.       writeln('  Bytes of Available Memory:   ',MemoryAvail:6:0);
  368.     end;
  369.   Writeln('Done inverting, now storing results....');
  370.   PrintResults;
  371. End.
  372.  
  373.  
  374. (**********************************************************************)
  375. (* p.p.s. To reduce mem reqs, I would move "Text:S" to end of record  *)
  376. (* description.  Then, instead of New(recptr) I would use             *)
  377. (* GetMem(recptr,sizeof(recptr^)-WordLength+Length(St)); That way, if *)
  378. (* St='HEAD', you save 10 bytes of memory!  -JimKeo[hane]             *)
  379. (**********************************************************************)
  380.  
  381.